# Load packages quietly: simulateGP provides generate_gwas_params() /
# generate_gwas_ss() used below, tidyverse for data wrangling and plotting,
# data.table for fast file reading (fread), knitr for chunk options.
suppressPackageStartupMessages(suppressWarnings({
library(simulateGP)
library(tidyverse)
library(data.table)
library(knitr)
}))
# Cache chunks and suppress messages/warnings in the rendered document
opts_chunk$set(cache=TRUE, echo=TRUE, message=FALSE, warning=FALSE)
Simulation function: runs one GWAS simulation replicate for a single parameter set
# Run one GWAS simulation replicate.
#
# p: a one-row tibble/data.frame with columns nsnp, h2, S, nid and pval.
# Returns p augmented with:
#   nsig              - number of SNPs passing the p-value threshold
#   nsig_weak         - significant SNPs with true F-statistic < 20
#   nsig_overestimate - significant SNPs whose |bhat| 95% lower bound
#                       exceeds the true |beta| (winner's curse)
#   min_rsq           - smallest variance explained among significant SNPs
runsim <- function(p)
{
	# Simulate allele frequencies for each causal SNP
	map <- tibble(snp=seq_len(p$nsnp), af=runif(p$nsnp, 0.01, 0.99))
	# True effect sizes under heritability h2 and selection parameter S;
	# rsq = per-SNP variance explained, 2*beta^2*af*(1-af)
	gp <- generate_gwas_params(map, h2=p$h2, S = p$S) %>%
		mutate(
			rsq=2 * beta^2 * af * (1-af)
		)
	# Sampled summary statistics for nid individuals; tfval = "true"
	# F-statistic, using the true beta against the estimated SE
	ss <- generate_gwas_ss(gp, nid=p$nid) %>%
		mutate(tfval = (gp$beta/se)^2)
	sig <- ss$pval < p$pval
	p$nsig <- sum(sig)
	p$nsig_weak <- sum(sig & ss$tfval < 20)
	p$nsig_overestimate <- sum(sig & (abs(ss$bhat) - 1.96 * ss$se) > abs(gp$beta))
	# Guard the empty case: min() of an empty vector is Inf with a warning,
	# so report NA when nothing reaches significance
	p$min_rsq <- if (any(sig)) min(ss$rsq[sig]) else NA_real_
	return(p)
}
Setup parameters
# Full factorial simulation design: sample size x number of causal SNPs x
# heritability x significance threshold x selection parameter, 10 replicates
# of each combination.
param <- expand.grid(
	nid  = seq(10000, 250000, by = 10000),
	nsnp = c(500, 1000, 5000, 10000, 50000, 100000),
	h2   = c(0.1, 0.5, 0.9),
	pval = 5e-8,
	S    = c(-1, 0, 1),
	rep  = 1:10
)
param <- as_tibble(param)
str(param)
## tibble [13,500 × 6] (S3: tbl_df/tbl/data.frame)
## $ nid : num [1:13500] 1e+04 2e+04 3e+04 4e+04 5e+04 6e+04 7e+04 8e+04 9e+04 1e+05 ...
## $ nsnp: num [1:13500] 500 500 500 500 500 500 500 500 500 500 ...
## $ h2 : num [1:13500] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ pval: num [1:13500] 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 ...
## $ S : num [1:13500] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ rep : int [1:13500] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "out.attrs")=List of 2
## ..$ dim : Named int [1:6] 25 6 3 1 3 10
## .. ..- attr(*, "names")= chr [1:6] "nid" "nsnp" "h2" "pval" ...
## ..$ dimnames:List of 6
## .. ..$ nid : chr [1:25] "nid= 10000" "nid= 20000" "nid= 30000" "nid= 40000" ...
## .. ..$ nsnp: chr [1:6] "nsnp=5e+02" "nsnp=1e+03" "nsnp=5e+03" "nsnp=1e+04" ...
## .. ..$ h2 : chr [1:3] "h2=0.1" "h2=0.5" "h2=0.9"
## .. ..$ pval: chr "pval=5e-08"
## .. ..$ S : chr [1:3] "S=-1" "S= 0" "S= 1"
## .. ..$ rep : chr [1:10] "rep= 1" "rep= 2" "rep= 3" "rep= 4" ...
Run simulations
# Run every parameter combination and bind the one-row results into a single
# tibble. lapply over seq_len() replaces the manual index loop that grew a
# list element-by-element (and the unsafe 1:nrow() sequence).
l <- lapply(seq_len(nrow(param)), function(i) runsim(param[i, ]))
l <- bind_rows(l)
What fraction of discovered SNPs will be significantly overestimated for a given minimum rsq value?
# Fraction of significant discoveries that are overestimated, by sample size;
# colour = number of causal SNPs, one row of panels per heritability
ggplot(l, aes(x = nid, y = nsig_overestimate / nsig, colour = as.factor(nsnp))) +
	geom_point() +
	geom_smooth() +
	scale_x_log10() +
	ylim(c(0, 1)) +
	scale_colour_brewer(type = "qual") +
	facet_grid(h2 ~ .)
And what fraction will be weak?
# Fraction of significant discoveries that are weak (true F < 20), by sample
# size; panels by heritability (rows) and selection parameter S (columns)
ggplot(l, aes(x = nid, y = nsig_weak / nsig, colour = as.factor(nsnp))) +
	geom_point() +
	geom_smooth() +
	scale_x_log10() +
	scale_colour_brewer(type = "qual") +
	facet_grid(h2 ~ S)
What is the relationship between overestimated and weak instruments? As power improves, more of the overestimated instruments remain strong.
# Number of weak instruments against number of overestimated instruments
ggplot(l, aes(x = nsig_overestimate, y = nsig_weak, colour = as.factor(nsnp))) +
	geom_point() +
	geom_smooth() +
	scale_x_log10() +
	scale_colour_brewer(type = "qual") +
	facet_grid(h2 ~ S)
How many instruments are overestimated?
# Count of overestimated instruments by sample size
ggplot(l, aes(x = nid, y = nsig_overestimate, colour = as.factor(nsnp))) +
	geom_point() +
	geom_smooth() +
	scale_x_log10() +
	scale_colour_brewer(type = "qual") +
	facet_grid(h2 ~ S)
# Proportion of discoveries that are overestimated, by sample size
ggplot(l, aes(x = nid, y = nsig_overestimate / nsig, colour = as.factor(nsnp))) +
	geom_point() +
	geom_smooth() +
	scale_x_log10() +
	scale_colour_brewer(type = "qual") +
	facet_grid(h2 ~ S)
# Minimum variance explained among significant SNPs, by sample size
ggplot(l, aes(x = nid, y = min_rsq, colour = as.factor(nsnp))) +
	geom_point() +
	geom_smooth() +
	scale_colour_brewer(type = "qual") +
	facet_grid(h2 ~ S)
# Number of significant discoveries by sample size
ggplot(l, aes(x = nid, y = nsig, colour = as.factor(nsnp))) +
	geom_point() +
	geom_smooth() +
	scale_colour_brewer(type = "qual") +
	facet_grid(h2 ~ S)
# Proportion overestimated against the number of discoveries
ggplot(l, aes(x = nsig, y = nsig_overestimate / nsig, colour = as.factor(nsnp))) +
	geom_point() +
	geom_smooth() +
	scale_colour_brewer(type = "qual") +
	facet_grid(h2 ~ S)
# Empirical UK Biobank results: loads `instrument_counts` (and possibly other
# objects) from the rdata file, plus per-trait tophit counts
load("../results/instrument_list.rdata")
counts <- fread("../results/count_tophits.txt")
# Keep only instruments with a discovery-sample effect estimate
instrument_counts <- subset(instrument_counts, !is.na(BETA.disc))
weak_p <- pf(10, 1, 100000, low=FALSE)
Quick summary
# Per-trait counts of significant, weak and overestimated instruments in the
# discovery and replication samples.
# BUG FIX: the original used mean(ndisc, nrepl), but mean()'s second argument
# is `trim`, so nrepl was silently consumed as a trim fraction and every
# nhalf* column equalled its disc counterpart (visible in the recorded
# outputs: sum(nhalf) == sum(ndisc) == 13673, nhalf_weak == ndisc_weak == 734,
# nhalf_overestimated == ndisc_overestimated == 2623). The intended average
# of the two counts is written explicitly.
s1 <- instrument_counts %>%
	group_by(id) %>%
	summarise(
		ndisc=sum(P_BOLT_LMM_INF.disc < 5e-8),
		nrepl=sum(P_BOLT_LMM_INF.repl < 5e-8),
		# Average of discovery and replication counts
		nhalf=(ndisc + nrepl) / 2,
		# Significant in either sample
		ntot=sum(P_BOLT_LMM_INF.disc < 5e-8 | P_BOLT_LMM_INF.repl < 5e-8),
		# Significant in one sample but weak (p > weak_p, i.e. F < 10) in the other
		ndisc_weak=sum(P_BOLT_LMM_INF.disc < 5e-8 & P_BOLT_LMM_INF.repl > weak_p),
		nrepl_weak=sum(P_BOLT_LMM_INF.repl < 5e-8 & P_BOLT_LMM_INF.disc > weak_p),
		# Overestimated: lower 95% bound of |beta| in one sample exceeds the
		# |beta| point estimate in the other
		ndisc_overestimated=sum(P_BOLT_LMM_INF.disc < 5e-8 & (abs(BETA.disc) - 1.96 * SE.disc) > abs(BETA.repl)),
		nrepl_overestimated=sum(P_BOLT_LMM_INF.repl < 5e-8 & (abs(BETA.repl) - 1.96 * SE.repl) > abs(BETA.disc)),
		nhalf_overestimated=(ndisc_overestimated + nrepl_overestimated) / 2,
		nhalf_weak=(ndisc_weak + nrepl_weak) / 2
	)
Number of traits with an instrument
# Count traits with at least one instrument (averaged disc/repl count > 0)
sum(s1$nhalf > 0)
## [1] 590
Total number of instruments
# Instrument totals: averaged, discovery-only, replication-only, and union
# NOTE(review): sum(nhalf) equals sum(ndisc) exactly here because
# mean(ndisc, nrepl) above passes nrepl as the `trim` argument, so nhalf is
# just ndisc — confirm intent.
sum(s1$nhalf)
## [1] 13673
sum(s1$ndisc)
## [1] 13673
sum(s1$nrepl)
## [1] 13723
# Significant in either discovery or replication
sum(s1$ntot)
## [1] 18482
How many instruments per trait?
# Distribution of instruments per trait, including traits with none
summary(s1$ndisc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 12.13 4.00 477.00
summary(s1$nrepl)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 12.18 3.00 460.00
# Same distributions restricted to traits with at least one instrument
summary(s1$ndisc[s1$ndisc != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 3.00 23.17 11.00 477.00
summary(s1$nrepl[s1$nrepl != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 3.00 23.54 11.00 460.00
Number of weak instruments
# Totals of weak instruments (significant in one sample, F < 10 in the other)
sum(s1$nhalf_weak)
## [1] 734
sum(s1$ndisc_weak)
## [1] 734
sum(s1$nrepl_weak)
## [1] 628
Number of overestimated instruments
# Totals of overestimated instruments (CI lower bound in one sample exceeds
# the point estimate in the other)
sum(s1$nhalf_overestimated)
## [1] 2623
sum(s1$ndisc_overestimated)
## [1] 2623
sum(s1$nrepl_overestimated)
## [1] 2555
Overestimated instruments
# Distribution of overestimated instruments among traits that have >= 1
# instrument
summary(s1$nhalf_overestimated[s1$nhalf != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 4.446 3.000 56.000
summary(s1$ndisc_overestimated[s1$ndisc != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 4.446 3.000 56.000
summary(s1$nrepl_overestimated[s1$nrepl != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 4.383 4.000 64.000
# Per-trait proportion of overestimated and weak instruments
hist(s1$nhalf_overestimated / s1$nhalf, breaks=100)
hist(s1$nhalf_weak / s1$nhalf, breaks=100)
# Proportion of overestimated instruments against number of discoveries
ggplot(s1) +
	geom_point(aes(x = ndisc, y = ndisc_overestimated / ndisc))
# Proportion of weak instruments against number of discoveries
ggplot(s1) +
	geom_point(aes(x = ndisc, y = ndisc_weak / ndisc))
# Overlay the empirical UK Biobank per-trait proportions on the simulation
# results: proportion of overestimated associations vs number of discoveries
p1 <- bind_rows(
# Simulations: discovery count, number of causal SNPs, overestimate count
l %>% dplyr::select(nsig=nsig, nsnp=nsnp, nsig_overestimate) %>% mutate(what="Simulations"),
# UK Biobank: rename per-trait columns to match the simulation columns
s1 %>% dplyr::select(nsig=ndisc, nsig_overestimate=ndisc_overestimated) %>% mutate(what="UK Biobank")
) %>%
{
ggplot(., aes(x=nsig, y=nsig_overestimate/nsig)) +
# Simulated points and smoothed curves, coloured by number of causal variants
geom_point(data=subset(., what !="UK Biobank"), aes(colour=as.factor(nsnp)), size=0.1, alpha=0.5) +
geom_smooth(data=subset(., what !="UK Biobank"), aes(colour=as.factor(nsnp)), se=FALSE) +
# Empirical UK Biobank traits plotted on top in the default colour
geom_point(data=subset(., what =="UK Biobank")) +
scale_colour_brewer(type="seq", palette=2) +
# scale_x_log10() +
xlim(c(0,500)) +
labs(x="Discovered associations", y="Proportion overestimated", colour="Number of causal variants", shape="")
}
p1
# Dodged histograms of the per-trait proportions of overestimated and weak
# instruments (discovery sample)
s1 %>%
mutate(propover=ndisc_overestimated/ndisc, propweak=ndisc_weak/ndisc) %>%
dplyr::select(id, propover, propweak) %>%
# Long format so the two proportions share one fill-mapped histogram
pivot_longer(cols=c(propover, propweak)) %>%
ggplot(., aes(x=value)) +
geom_histogram(aes(fill=name), position="dodge", bins=100)
# geom_density(aes(fill=name), alpha=0.2)
# Histogram of the per-trait proportion of overestimated discovery instruments
ggplot(s1) +
	geom_histogram(aes(x = ndisc_overestimated / ndisc), bins = 100)